# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627

# Bootstrap analysis of efficacy of the Pfizer-BioNTech COVID-19 Vaccine

# The Phase III clinical trial of Pfizer vaccine included n=44000 people,

# 50% took trt, 50% placebo.

# Result: 8 vaccinated and 162 placebo participants contracted COVID-19.

# A conclusion was made that vaccination reduces the risk of COVID by

# the factor of 20.

# Vaccine critics point to a small sample size of 170 for any meaningful

# conclusions. Here, we explore what confidence statements we can deduce

# from these data.

# Construct a Bootstrap CI for p1/p2 = the ratio of probabilities

# p1 = P( infection | placebo group ), p2 = P( infection | vaccine group )

V = c(rep(0,22000),rep(1,22000)); # 0 = placebo, 1 = treatment

X = c(rep(0,21838),rep(1,162),rep(0,21992),rep(1,8)); # 1 = contracted covid

XV = data.frame(X,V);

pratio = function(xv,Z){ # function of data xv and a subsample Z

x = xv[,1]; v = xv[,2];

Pplacebo = sum(x[Z]==1 & v[Z]==0)/sum(v[Z]==0);

Pvaccine = sum(x[Z]==1 & v[Z]==1)/sum(v[Z]==1);

return( Pplacebo/Pvaccine )

}

pratio(XV,)

# The ratio is estimated as 20.25. We need to supply it with the margin of error

# and a confidence level. That is, use Bootstrap to construct a confidence interval.

library(boot)

b = boot( XV, pratio, R=10000 ) # Apply bootstrap

quantile(b$t,c(0.025,0.975)) # This is a 95% confidence interval for p1/p2

# 2.5% 97.5% # Thus, we can claim with 95% confidence that the ratio of

# 11.34329 54.51769 # probabilities is between 11.3 and 54.5. But do we need an interval,

# or should we just find a lower bound?

quantile(b$t,0.05) # Try a one-sided confidence interval.

# 5% # We can now claim, also with 95% confidence, that vaccination

# 12.29509 # reduces the chance of COVID-19 infection by at least a factor of 12.

################################################################################

### Official efficacy of the vaccine ###

### The efficacy of a vaccine is defined as the difference of risks between ###

### the vaccine group and the placebo group, relative to the placebo group. ###

################################################################################

### To study the efficacy of Pfizer, we modify our function:

efficacy = function(xv,Z){ # function of data xv and a subsample Z

x = xv[,1]; v = xv[,2];

Pplacebo = sum(x[Z]==1 & v[Z]==0)/sum(v[Z]==0);

Pvaccine = sum(x[Z]==1 & v[Z]==1)/sum(v[Z]==1);

return( (Pplacebo - Pvaccine)/Pplacebo ) }

pratio(XV,) # This gives the estimated efficacy which you see in all official reports

# 0.9506173 # But we know that it is only based on the sample, so it has a margin of error.

# So, we proceed by building a confidence interval

b = boot( XV, efficacy, 10000 ) # Bootstrap!

hist(b$t) # This is a histogram of bootstrap efficacy values, it's not very symmetric

# I'm just checking normality, to see if we can settle with a PARAMETRIC

# bootstrap confidence interval, which is the mean b$t, plus or minus 1.96

# standard deviations

shapiro.test(b$t) # Shapiro-Wilk test is a rigorous test of normality

Error in shapiro.test(b$t) : sample size must be between 3 and 5000

# The way it's built in R does not handle samples > 5000

shapiro.test(b$t[sample(10000,5000)]) # Ok, we select a random sample of n=5000

# Shapiro-Wilk normality test

# data: b$t[sample(10000, 5000)]

# W = 0.98926, p-value < 2.2e-16 # Shapiro-Wilk test rejects normal distribution.

# Hence, we construct a NONPARAMETRIC

quantile( b$t, c(0.025,0.975) ) # bootstrap confidence interval

# 2.5% 97.5%

# 0.9105982 0.9816955 # This is a 95% confidence interval for efficacy

quantile( b$t, c(0.005,0.995) )

0.5% 99.5%

0.8947269 0.9882617 # This is a 99% confidence interval